┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/eLqpV/src/mudata.jl:351
# Load the metabolite fold-change matrix and keep only the rows selected by
# `biobank.obs.kept_species`.
metmtx = biobank["metabolites_foldchange"].X[:, :][biobank.obs.kept_species, :]

# Infinite fold changes come from values too close to zero for a detectable
# measurement, so assume no change.
metmtx[isinf.(metmtx)] .= 0.0
@show size(metmtx)

# Filter to metabolites with a detectable (non-zero) shift in more than 10% of
# the rows, i.e. drop columns where 90% or more of the entries are exactly zero.
metabolite_names_full = biobank["metabolites_foldchange"].var_names.vals
keepmetabolites_mask = vec(mean(iszero, metmtx; dims=1) .< 0.9)
metabolite_names = metabolite_names_full[keepmetabolites_mask]
metmtx = metmtx[:, keepmetabolites_mask]
@show size(metmtx);
Show log-2 fold change from base media across strains from 9 species with more than 20 replicates. The multi-modal shape of some distributions indicates sub-species variation in phenotype.
# Materialize the full `oggs` matrix.
csb_mtx = biobank["oggs"].X[:, :]

# find variable OGGs (columns whose variance is non-zero)
ogg_mask = vec(var(csb_mtx; dims=1) .> 0)

# mean center each variable OGG column
meancentered_csbmtx = mapslices(col -> col .- mean(col), csb_mtx[:, ogg_mask]; dims=1)

# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx)
@show size(meancentered_csbmtx);
size(meancentered_csbmtx) = (669, 8359)
# Principal components from the SVD: left singular vectors scaled by the
# singular values.
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S)

# Percent of variance explained by each PC (squared singular value over the
# sum of squared singular values, times 100).
pctvar = (abs2.(bbusv_csb.S) ./ sum(abs2, bbusv_csb.S)) .* 100;
map colors to NCBI families in correct order for plotting
familyid = bbobs.NCBI_Family

# Family labels ordered by decreasing strain count, ties broken by label.
# Sorting the countmap keys directly avoids hard-coding the number of families.
familycounts = countmap(familyid)
orderedfamilylabels = sort(
    collect(keys(familycounts));
    by = label -> (-familycounts[label], label),
)

# For each family in sorted label order, pick the palette color at that
# family's position in the count ordering; `permutedims` turns the result
# into a 1×n row.
# NOTE(review): `:Set3_12` presumably provides 12 colors, so more than 12
# families would index out of bounds — confirm.
familycolors = permutedims(
    palette(:Set3_12).colors.colors[indexin(sort(unique(familyid)), orderedfamilylabels)],
)

# collect Butyrate and Succinate values for colormap
metabolites_foldchange = biobank["metabolites_foldchange"]
butyrate_foldchange_per_strain =
    metabolites_foldchange.X[:, findfirst(==("Butyrate"), metabolites_foldchange.var.label)]
succinate_foldchange_per_strain =
    metabolites_foldchange.X[:, findfirst(==("Succinate"), metabolites_foldchange.var.label)]

# get color limits based on most extreme value
butyrate_lims = getlims(butyrate_foldchange_per_strain)
succinate_lims = getlims(succinate_foldchange_per_strain);
Each sub-plot shows 669 CSB strains on PC 1 (x axis) and PC 2 (y axis). Aqua dots correspond to strains belonging to the species listed in each sub-title followed by the number of strains belonging to that species. Remaining strains are shown in grey.
# Materialize the `oggs` matrix and keep only the rows selected by `mask`.
csb_mtx = biobank["oggs"].X[:, :][mask, :]

# find variable OGGs (columns whose variance is non-zero within the subset)
ogg_mask = vec(var(csb_mtx; dims=1) .> 0)

# mean center each variable OGG column
meancentered_csbmtx = mapslices(col -> col .- mean(col), csb_mtx[:, ogg_mask]; dims=1)

# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx)
@show size(meancentered_csbmtx);
size(meancentered_csbmtx) = (229, 3755)
# Principal components from the SVD: left singular vectors scaled by the
# singular values.
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S)

# Percent of variance explained by each PC (squared singular value over the
# sum of squared singular values, times 100).
pctvar = (abs2.(bbusv_csb.S) ./ sum(abs2, bbusv_csb.S)) .* 100;
Map colors to NCBI genera in correct order for plotting
# Genus label of every strain selected by `mask`.
genusid = biobank.obs.NCBI_Genus[mask]

# collect Butyrate and Succinate values for colormap, restricted to `mask`
# (the `[mask, :]` indexing is kept as-is so the result shape is unchanged)
metabolites_foldchange = biobank["metabolites_foldchange"]
butyrate_foldchange_per_strain =
    metabolites_foldchange.X[:, findfirst(==("Butyrate"), metabolites_foldchange.var.label)][mask, :]
succinate_foldchange_per_strain =
    metabolites_foldchange.X[:, findfirst(==("Succinate"), metabolites_foldchange.var.label)][mask, :];
# Materialize the `oggs` matrix and keep only the rows selected by `mask`.
csb_mtx = biobank["oggs"].X[:, :][mask, :]

# find variable OGGs (columns whose variance is non-zero within the subset)
ogg_mask = vec(var(csb_mtx; dims=1) .> 0)

# mean center each variable OGG column
meancentered_csbmtx = mapslices(col -> col .- mean(col), csb_mtx[:, ogg_mask]; dims=1)

# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx)
@show size(meancentered_csbmtx);
size(meancentered_csbmtx) = (103, 2065)
# Principal components from the SVD: left singular vectors scaled by the
# singular values.
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S)

# Percent of variance explained by each PC (squared singular value over the
# sum of squared singular values, times 100).
pctvar = (abs2.(bbusv_csb.S) ./ sum(abs2, bbusv_csb.S)) .* 100;
map colors to NCBI Species in correct order for plotting
# Species label of every strain selected by `mask`.
speciesid = biobank.obs.NCBI_Species[mask]

# collect Butyrate and Succinate values for colormap, restricted to `mask`
# (the `[mask, :]` indexing is kept as-is so the result shape is unchanged)
metabolites_foldchange = biobank["metabolites_foldchange"]
butyrate_foldchange_per_strain =
    metabolites_foldchange.X[:, findfirst(==("Butyrate"), metabolites_foldchange.var.label)][mask, :]
succinate_foldchange_per_strain =
    metabolites_foldchange.X[:, findfirst(==("Succinate"), metabolites_foldchange.var.label)][mask, :];
# Materialize the `oggs` matrix and keep only the rows selected by `mask`.
csb_mtx = biobank["oggs"].X[:, :][mask, :]

# find variable OGGs (columns whose variance is non-zero within the subset)
ogg_mask = vec(var(csb_mtx; dims=1) .> 0)

# mean center each variable OGG column
meancentered_csbmtx = mapslices(col -> col .- mean(col), csb_mtx[:, ogg_mask]; dims=1)

# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx)
@show size(meancentered_csbmtx);
size(meancentered_csbmtx) = (93, 1551)
# Principal components from the SVD: left singular vectors scaled by the
# singular values.
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S)

# Percent of variance explained by each PC (squared singular value over the
# sum of squared singular values, times 100).
pctvar = (abs2.(bbusv_csb.S) ./ sum(abs2, bbusv_csb.S)) .* 100;
map colors to NCBI Species in correct order for plotting
# Species label of every strain selected by `mask`.
speciesid = biobank.obs.NCBI_Species[mask]

# collect Butyrate and Succinate values for colormap, restricted to `mask`
# (the `[mask, :]` indexing is kept as-is so the result shape is unchanged)
metabolites_foldchange = biobank["metabolites_foldchange"]
butyrate_foldchange_per_strain =
    metabolites_foldchange.X[:, findfirst(==("Butyrate"), metabolites_foldchange.var.label)][mask, :]
succinate_foldchange_per_strain =
    metabolites_foldchange.X[:, findfirst(==("Succinate"), metabolites_foldchange.var.label)][mask, :];
Each sub-plot shows 669 CSB strains on UMAP 1 (x axis) and UMAP 2 (y axis) generated from euclidean distance across the leading 10 PCs and 77 shared nearest neighbors. Aqua dots correspond to strains belonging to the species listed in each sub-title followed by the number of strains belonging to that species. Remaining strains are shown in grey.